# This script prepares the different oversampled and undersampled training datasets to develop the early life CAPE models. 
# These early life training datasets prepared in this script will have had the following optimisation techniques applied: ADASYN oversampling and/or random undersampling to give 1:1 class balance
# Once the data is prepared, this script needs to be immediately followed by: "Model_development_XXX.txt", where XXX is the name of the different algorithms considered. 
# The data in file "Early_life_standardised_initial_training_dataset_510IDs.csv" is found in IOWBC_training_test_data.xlsx, sheet: "Standardised earlylife training"
# The data in files named "Oversampled_earlylife_dataset_XXX.csv" were developed using the script "Data_preparation_CAPE_oversampling.txt" (data can be found in XXX).
# Python version 3.6.8 was used 

# Imports (must come before any use of the modules below).
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.utils import shuffle

# Set working directory.
# NOTE: in the original script os.chdir() ran before `import os`, which
# raises NameError on a fresh interpreter; the import now precedes it.
os.chdir("/../../")

#######################
### Import datasets ###
#######################
# Construct both oversampled and undersampled datasets in the following way:
# data_0 = complete case		data_0_U = complete case, undersampled
# data_25_O = 25% oversampled cases		data_25_OU = 25% oversampled cases, undersampled controls to 1:1 class ratio

# Complete-case early life training dataset (no oversampling applied).
data_0 = pd.read_csv("Early_life_standardised_initial_training_dataset_510IDs.csv", index_col=False)
print('Original dataset shape %s' % Counter(data_0.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 68})

# Undersample the controls to a 1:1 class ratio with the cases.
s1 = data_0.loc[data_0['Asthma_10YR'] == 1]  # cases
s0 = data_0.loc[data_0['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), :]  # keep as many controls as there are cases (68 here)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data_0_U = pd.concat([s1, s0], ignore_index=True)
data_0_U = shuffle(data_0_U, random_state=123)
print('Undersampled dataset shape %s' % Counter(data_0_U.Asthma_10YR))
# Undersampled dataset shape Counter({0: 68, 1: 68})


# 25% ADASYN-oversampled cases dataset.
data_25_O = pd.read_csv("Oversampled_earlylife_dataset_25%.csv", index_col=False)
data_25_O = data_25_O.iloc[0:527, :]
print('Original dataset shape %s' % Counter(data_25_O.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 85})

# Undersample the controls to a 1:1 class ratio with the cases.
s1 = data_25_O.loc[data_25_O['Asthma_10YR'] == 1]  # cases
s0 = data_25_O.loc[data_25_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), :]  # keep as many controls as there are cases (85 here)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data_25_OU = pd.concat([s1, s0], ignore_index=True)
data_25_OU = shuffle(data_25_OU, random_state=123)
print('Undersampled dataset shape %s' % Counter(data_25_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 85, 1: 85})


# 50% ADASYN-oversampled cases dataset.
data_50_O = pd.read_csv("Oversampled_earlylife_dataset_50%.csv", index_col=False)
data_50_O = data_50_O.iloc[0:544, :]
print('Original dataset shape %s' % Counter(data_50_O.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 102})

# Undersample the controls to a 1:1 class ratio with the cases.
s1 = data_50_O.loc[data_50_O['Asthma_10YR'] == 1]  # cases
s0 = data_50_O.loc[data_50_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), :]  # keep as many controls as there are cases (102 here)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data_50_OU = pd.concat([s1, s0], ignore_index=True)
data_50_OU = shuffle(data_50_OU, random_state=123)
print('Undersampled dataset shape %s' % Counter(data_50_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 102, 1: 102})


# 100% ADASYN-oversampled cases dataset.
data_100_O = pd.read_csv("Oversampled_earlylife_dataset_100%.csv", index_col=False)
data_100_O = data_100_O.iloc[0:578, :]
print('Original dataset shape %s' % Counter(data_100_O.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 136})

# Undersample the controls to a 1:1 class ratio with the cases.
s1 = data_100_O.loc[data_100_O['Asthma_10YR'] == 1]  # cases
s0 = data_100_O.loc[data_100_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), :]  # keep as many controls as there are cases (136 here)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data_100_OU = pd.concat([s1, s0], ignore_index=True)
data_100_OU = shuffle(data_100_OU, random_state=123)
print('Undersampled dataset shape %s' % Counter(data_100_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 136, 1: 136})


# 150% ADASYN-oversampled cases dataset.
data_150_O = pd.read_csv("Oversampled_earlylife_dataset_150%.csv", index_col=False)
data_150_O = data_150_O.iloc[0:612, :]
print('Original dataset shape %s' % Counter(data_150_O.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 170})

# Undersample the controls to a 1:1 class ratio with the cases.
s1 = data_150_O.loc[data_150_O['Asthma_10YR'] == 1]  # cases
s0 = data_150_O.loc[data_150_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), :]  # keep as many controls as there are cases (170 here)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data_150_OU = pd.concat([s1, s0], ignore_index=True)
data_150_OU = shuffle(data_150_OU, random_state=123)
print('Undersampled dataset shape %s' % Counter(data_150_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 170, 1: 170})


# 200% ADASYN-oversampled cases dataset.
data_200_O = pd.read_csv("Oversampled_earlylife_dataset_200%.csv", index_col=False)
data_200_O = data_200_O.iloc[0:646, :]
print('Original dataset shape %s' % Counter(data_200_O.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 204})

# Undersample the controls to a 1:1 class ratio with the cases.
s1 = data_200_O.loc[data_200_O['Asthma_10YR'] == 1]  # cases
s0 = data_200_O.loc[data_200_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), :]  # keep as many controls as there are cases (204 here)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data_200_OU = pd.concat([s1, s0], ignore_index=True)
data_200_OU = shuffle(data_200_OU, random_state=123)
print('Undersampled dataset shape %s' % Counter(data_200_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 204, 1: 204})


# 250% ADASYN-oversampled cases dataset.
data_250_O = pd.read_csv("Oversampled_earlylife_dataset_250%.csv", index_col=False)
data_250_O = data_250_O.iloc[0:680, :]
print('Original dataset shape %s' % Counter(data_250_O.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 238})

# Undersample the controls to a 1:1 class ratio with the cases.
s1 = data_250_O.loc[data_250_O['Asthma_10YR'] == 1]  # cases
s0 = data_250_O.loc[data_250_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), :]  # keep as many controls as there are cases (238 here)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data_250_OU = pd.concat([s1, s0], ignore_index=True)
data_250_OU = shuffle(data_250_OU, random_state=123)
print('Undersampled dataset shape %s' % Counter(data_250_OU.Asthma_10YR))
# Undersampled dataset shape Counter({1: 238, 0: 238})


# 300% ADASYN-oversampled cases dataset.
data_300_O = pd.read_csv("Oversampled_earlylife_dataset_300%.csv", index_col=False)
data_300_O = data_300_O.iloc[0:714, :]
print('Original dataset shape %s' % Counter(data_300_O.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 272})

# Undersample the controls to a 1:1 class ratio with the cases.
s1 = data_300_O.loc[data_300_O['Asthma_10YR'] == 1]  # cases
s0 = data_300_O.loc[data_300_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), :]  # keep as many controls as there are cases (272 here)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data_300_OU = pd.concat([s1, s0], ignore_index=True)
data_300_OU = shuffle(data_300_OU, random_state=123)
print('Undersampled dataset shape %s' % Counter(data_300_OU.Asthma_10YR))
# Undersampled dataset shape Counter({1: 272, 0: 272})


# Assign all training datasets to be considered for model development into one
# list: oversampled-only datasets first, then their undersampled counterparts.
data = [
    data_0,
    data_25_O,
    data_50_O,
    data_100_O,
    data_150_O,
    data_200_O,
    data_250_O,
    data_300_O,
    data_0_U,
    data_25_OU,
    data_50_OU,
    data_100_OU,
    data_150_OU,
    data_200_OU,
    data_250_OU,
    data_300_OU,
]

# Indices of the datasets in `data`; used during model development to loop
# through each training dataset. Derived from len(data) so it stays correct if
# datasets are added or removed. NOTE: this name shadows the builtin `set`;
# kept for backward compatibility with the follow-on model development scripts.
set = list(range(len(data)))

# Import early life test data, standardised against the initial early life
# training dataset - data found in IOWBC_training_test_data.xlsx,
# sheet: "Standardised earlylife test set"
test = pd.read_csv("Early_life_standardised_test_dataset_255IDs.csv", index_col=False)
# Separate the outcome column from the feature columns.
y_test = test['Asthma_10YR']
X_test = test.drop(columns=['Study_ID', 'Asthma_10YR'])

